# Code
# Packages used by the analysis. The original listed this set twice and had
# statements fused onto single lines (a syntax error); each package is now
# attached exactly once, in the original order.
library(ggplot2)
library(plotly)
library(tidyverse)
library(dplyr)
library(parcoords)
library(d3r)
library(GGally)
library(forcats)
library(redav)

# Raw cricket table that the cleaning steps below operate on.
df <- read.csv("Data_Cricket_Data_by_season_all.csv")
# ---- Clean the player table ----
# Player carries a parenthesised suffix: copy the text inside the parentheses
# into Country, then strip the parenthesised part from Player.
df$Country <- sub(".*\\((.*?)\\).*", "\\1", df$Player)
df$Player <- sub("\\(.*?\\)", "", df$Player)

# A "*" in HS is recorded as a Yes/No flag in NotOut and then removed from HS.
df$NotOut <- ifelse(grepl("\\*", df$HS), "Yes", "No")
df$HS <- gsub("\\*", "", df$HS)

# Season is either a single value ("yyyy") or a split season ("yyyy/yy").
# start_year is everything before the "/".
df$start_year <- sub("/.*", "", df$Season)

# For split seasons the end year is the first two characters of Season plus
# the digits after "/"; single-value seasons end in the year they start.
df$end_year <- ifelse(
  grepl("/", df$Season, fixed = TRUE),
  paste0(substr(df$Season, 1, 2), sub(".*?/(\\d+)$", "\\1", df$Season)),
  df$start_year
)

# A season such as "1999/00" yields "1900" above; move it to "2000".
df$end_year <- gsub("1900", "2000", df$end_year)

# A six-character result (prefix + four-digit year) keeps only the
# four-digit year.
df$end_year <- ifelse(
  grepl("^.{6}$", df$end_year),
  substr(df$end_year, 3, 6),
  df$end_year
)

# Season has been split into start_year / end_year and is no longer needed.
df$Season <- NULL
df <- na.omit(df)

# Replace cells that consist solely of "-" with "0". The pattern is anchored
# so hyphens inside other text (e.g. hyphenated player names) are left
# untouched; the original unanchored gsub rewrote every hyphen in every
# column. As before, this step turns every column into character.
df[] <- lapply(df, function(x) sub("^\\s*-\\s*$", "0", x))
# Rows whose Country is "IND" or "ICC/IND". %in% never returns NA, so no
# all-NA rows can be introduced by the filter.
df_Ind <- df[df$Country %in% c("IND", "ICC/IND"), ]

# Total runs per start_year (columns are character at this point, hence
# as.numeric).
df1 <- df_Ind %>%
  group_by(start_year) %>%
  summarise(total_runs = sum(as.numeric(RunsDescending)))

odi_stats_ind <- read.csv("Data_India_ODI_stats.csv")

# The division below pairs rows purely by position, so the two tables must
# have the same number of rows; otherwise R would silently recycle PLD.
# NOTE(review): row order is assumed to match start_year order - confirm
# against the CSV.
if (nrow(df1) != nrow(odi_stats_ind)) {
  stop(
    "df1 has ", nrow(df1), " rows but odi_stats_ind has ",
    nrow(odi_stats_ind), "; cannot pair them by position.",
    call. = FALSE
  )
}
df1$ave_runs <- df1$total_runs / odi_stats_ind$PLD
################### Plot 1 ##################################
# Line-and-point chart of df1$ave_runs against start_year, with an x-axis
# tick every second year from 1970 to 2025.
ggplot(df1, aes(x = as.numeric(start_year), y = ave_runs)) +
  geom_line() +
  geom_point() +
  labs(x = "Year", y = "Average Runs (by Matches Played)") +
  scale_x_continuous(breaks = seq(1970, 2025, 2))
################### Plot 1 ##################################

# This rename was fused onto the end of the comment banner above and therefore
# never executed; it is restored as its own statement.
names(df)[15] <- "start"
# Positional renames: column 16 becomes "end" and column 5 becomes "Runs".
names(df)[c(16, 5)] <- c("end", "Runs")

# Bin the numeric start value into ten-year intervals from 1970 to 2030.
# Intervals are closed on the left ("[2010,2020)"); include.lowest closes the
# final interval on both sides. dig.lab = 4 keeps four-digit years in the
# level labels instead of scientific notation.
decade <- cut(
  as.numeric(df$start),
  breaks = seq(1970, 2030, by = 10),
  right = FALSE,
  include.lowest = TRUE,
  dig.lab = 4
)
df$Decade <- decade
# Per-player totals for the "[2010,2020)" decade: summed Runs plus the mean of
# SR and Ave across that player's rows (columns are character, hence
# as.numeric).
filtered_10_20 <- df %>%
  filter(Decade == "[2010,2020)") %>%
  group_by(Player, Country) %>%
  summarise(
    Runs = sum(as.numeric(Runs)),
    decade_avg_str = mean(as.numeric(SR)),
    decade_avg_ba = mean(as.numeric(Ave))
  )

# Highest run totals first.
filtered_10_20 <- filtered_10_20[
  order(filtered_10_20$Runs, decreasing = TRUE),
]

# This call was fused onto the end of the previous statement (a syntax
# error); it now stands on its own line.
library(plotly)
## Add ggplotly later on!
# Parallel-coordinates plot of the first 20 rows of filtered_10_20 (already
# sorted by Runs, descending). Columns 3:5 are the axes and column 1 is the
# grouping column.
ggparcoord(
  filtered_10_20[1:20, ],
  columns = 3:5,
  groupColumn = 1,
  showPoints = TRUE
)

# This read was fused onto the closing parenthesis of the plot call above
# (a syntax error); it is now a separate statement.
icc_2011 <- read.csv("Data_ICC_2011_F.csv")
# ---- 2011 data: preprocessing ----
# match_id is not used by any of the plots.
icc_2011$match_id <- NULL

# Blank dismissal fields are recoded as NA so wickets can be counted with
# !is.na().
icc_2011$wicket_type[icc_2011$wicket_type == ""] <- NA
icc_2011$player_dismissed[icc_2011$player_dismissed == ""] <- NA

# Integer part of ball, then grouped into blocks of ten: [0,10), [10,20), ...,
# with the last block closed on both sides ([40,50]).
icc_2011$over <- floor(icc_2011$ball)
over_intervals <- cut(
  icc_2011$over,
  breaks = seq(0, 50, 10),
  right = FALSE,
  include.lowest = TRUE
)
icc_2011$binned_overs <- over_intervals

# Runs (off the bat plus extras) and number of wickets per over block and
# innings.
df_2011_p1 <- summarize(
  group_by(icc_2011, binned_overs, innings),
  runs = sum(runs_off_bat) + sum(extras),
  n_wickets = sum(!is.na(wicket_type))
)
################ Plot Type 1 #####################
# Dodged bars of runs per over block, one bar per innings. On top of each bar
# sits a translucent point whose size is five times the wicket count, and a
# text label showing that wicket count. Size and label are taken directly
# from df_2011_p1, so they rely on the plotted rows keeping that row order.
df_2011_p1 %>%
  pivot_longer(innings) %>%
  ggplot(aes(x = binned_overs, y = runs, fill = factor(value))) +
  geom_col(position = "dodge") +
  geom_point(
    aes(y = runs),
    position = position_dodge(width = 0.85),
    size = df_2011_p1$n_wickets * 5,
    alpha = 0.2
  ) +
  geom_text(
    label = df_2011_p1$n_wickets,
    position = position_dodge(width = 0.80),
    vjust = 0,
    hjust = 0.64,
    size = 3
  ) +
  # Show the interval levels produced by cut() as 1-based over ranges.
  scale_x_discrete(
    breaks = c("[0,10)", "[10,20)", "[20,30)", "[30,40)", "[40,50]"),
    labels = c("1-10", "11-20", "21-30", "31-40", "41-50")
  ) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  labs(title = "", x = "Overs", y = "Runs Scores", fill = "Innings")
################ Plot Type 1 #####################
################ Plot Type 2 #####################
# Plot 1
# Rows with innings == 2, summed to runs off the bat per over block, striker
# and bowler.
inning_2 <- icc_2011[icc_2011$innings == 2, ]
df_2011_p2a <- inning_2 %>%
  group_by(binned_overs, striker, bowler) %>%
  summarise(runs_scored = sum(runs_off_bat))

# One panel per striker; a horizontal stacked bar per bowler, coloured by
# over block.
ggplot(df_2011_p2a, aes(y = bowler, x = runs_scored, fill = binned_overs)) +
  geom_col() +
  facet_wrap(~striker) +
  xlab("Runs Scored") +
  ylab("Bowler") +
  ggtitle("Runs scored by Indian Batsmen in front of Sri-Lankan Bowlers")

# Plot 2
# Same breakdown for rows with innings == 1.
inning_1 <- icc_2011[icc_2011$innings == 1, ]
df_2011_p2b <- inning_1 %>%
  group_by(binned_overs, striker, bowler) %>%
  summarise(runs_scored = sum(runs_off_bat))

ggplot(df_2011_p2b, aes(y = bowler, x = runs_scored, fill = binned_overs)) +
  geom_col() +
  facet_wrap(~striker) +
  xlab("Runs Scored") +
  ylab("Bowler") +
  ggtitle("Runs scored by Sri-Lankan Batsmen in front of Indian Bowlers")
################ Plot Type 2 #####################

# This read was fused onto the end of the comment banner above and therefore
# never executed, leaving icc_2015 undefined; it is restored as its own
# statement.
icc_2015 <- read.csv("Data_ICC_2015_F.csv")
# ---- 2015 data: preprocessing ----
# match_id is not used by any of the plots.
icc_2015$match_id <- NULL

# Blank dismissal fields are recoded as NA so wickets can be counted with
# !is.na().
icc_2015$wicket_type[icc_2015$wicket_type == ""] <- NA
icc_2015$player_dismissed[icc_2015$player_dismissed == ""] <- NA

# Integer part of ball, then grouped into blocks of ten: [0,10), [10,20), ...,
# with the last block closed on both sides ([40,50]).
icc_2015$over <- floor(icc_2015$ball)
over_intervals <- cut(
  icc_2015$over,
  breaks = seq(0, 50, 10),
  right = FALSE,
  include.lowest = TRUE
)
icc_2015$binned_overs <- over_intervals

# Runs (off the bat plus extras) and number of wickets per over block and
# innings.
df_2015_p1 <- summarize(
  group_by(icc_2015, binned_overs, innings),
  runs = sum(runs_off_bat) + sum(extras),
  n_wickets = sum(!is.na(wicket_type))
)
# Dodged bars of runs per over block, one bar per innings. On top of each bar
# sits a translucent point whose size is five times the wicket count, and a
# text label showing that wicket count. Size and label are taken directly
# from df_2015_p1, so they rely on the plotted rows keeping that row order.
df_2015_p1 %>%
  pivot_longer(innings) %>%
  ggplot(aes(x = binned_overs, y = runs, fill = factor(value))) +
  geom_col(position = "dodge") +
  geom_point(
    aes(y = runs),
    position = position_dodge(width = 0.85),
    size = df_2015_p1$n_wickets * 5,
    alpha = 0.2
  ) +
  geom_text(
    label = df_2015_p1$n_wickets,
    position = position_dodge(width = 0.80),
    vjust = 0,
    hjust = 0.64,
    size = 3
  ) +
  # Show the interval levels produced by cut() as 1-based over ranges.
  scale_x_discrete(
    breaks = c("[0,10)", "[10,20)", "[20,30)", "[30,40)", "[40,50]"),
    labels = c("1-10", "11-20", "21-30", "31-40", "41-50")
  ) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  labs(title = "", x = "Overs", y = "Runs Scores", fill = "Innings")
################ Plot Type 1 #####################
################ Plot Type 2 #####################
# Plot 1
# Rows with innings == 2, summed to runs off the bat per over block, striker
# and bowler.
inning_2 <- icc_2015[icc_2015$innings == 2, ]
df_2015_p2a <- summarise(
  group_by(inning_2, binned_overs, striker, bowler),
  runs_scored = sum(runs_off_bat)
)

# One panel per striker; a horizontal stacked bar per bowler, coloured by
# over block.
ggplot(df_2015_p2a, aes(x = runs_scored, y = bowler, fill = binned_overs)) +
  geom_col() +
  facet_wrap(~striker) +
  labs(
    title = "Runs scored by Australian Batsmen in front of New Zealand Bowlers",
    x = "Runs Scored",
    y = "Bowler"
  )

# Plot 2
# Same breakdown for rows with innings == 1.
inning_1 <- icc_2015[icc_2015$innings == 1, ]
df_2015_p2b <- summarise(
  group_by(inning_1, binned_overs, striker, bowler),
  runs_scored = sum(runs_off_bat)
)

ggplot(df_2015_p2b, aes(x = runs_scored, y = bowler, fill = binned_overs)) +
  geom_col() +
  facet_wrap(~striker) +
  labs(
    title = "Runs scored by New Zealand Batsmen in front of Australian Bowlers",
    x = "Runs Scored",
    y = "Bowler"
  )
################ Plot Type 2 #####################
####################### ICC 2015 Final ######################
####################### ICC 2019 Final ######################
####### Preprocessing #######
icc_2019 <- read.csv("Data_ICC_2019_F.csv")

# match_id is not used by any of the plots.
icc_2019$match_id <- NULL

# Blank dismissal fields are recoded as NA so wickets can be counted with
# !is.na().
icc_2019$wicket_type[icc_2019$wicket_type == ""] <- NA
icc_2019$player_dismissed[icc_2019$player_dismissed == ""] <- NA

# Integer part of ball, then grouped into blocks of ten: [0,10), [10,20), ...,
# with the last block closed on both sides ([40,50]).
icc_2019$over <- floor(icc_2019$ball)
over_intervals <- cut(
  icc_2019$over,
  breaks = seq(0, 50, 10),
  right = FALSE,
  include.lowest = TRUE
)
icc_2019$binned_overs <- over_intervals
####### Preprocessing #######
# Comment: the NA values created above do not indicate missing data; they are
# logical in nature.
################ Plot Type 1 #####################
# Runs (off the bat plus extras) and number of wickets per over block and
# innings.
df_2019_p1 <- summarize(
  group_by(icc_2019, binned_overs, innings),
  runs = sum(runs_off_bat) + sum(extras),
  n_wickets = sum(!is.na(wicket_type))
)
# Dodged bars of runs per over block, one bar per innings. On top of each bar
# sits a translucent point whose size is five times the wicket count, and a
# text label showing that wicket count. Size and label are taken directly
# from df_2019_p1, so they rely on the plotted rows keeping that row order.
df_2019_p1 %>%
  pivot_longer(innings) %>%
  ggplot(aes(x = binned_overs, y = runs, fill = factor(value))) +
  geom_col(position = "dodge") +
  geom_point(
    aes(y = runs),
    position = position_dodge(width = 0.85),
    size = df_2019_p1$n_wickets * 5,
    alpha = 0.2
  ) +
  geom_text(
    label = df_2019_p1$n_wickets,
    position = position_dodge(width = 0.80),
    vjust = 0,
    hjust = 0.64,
    size = 3
  ) +
  # Show the interval levels produced by cut() as 1-based over ranges.
  scale_x_discrete(
    breaks = c("[0,10)", "[10,20)", "[20,30)", "[30,40)", "[40,50]"),
    labels = c("1-10", "11-20", "21-30", "31-40", "41-50")
  ) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  labs(title = "", x = "Overs", y = "Runs Scores", fill = "Innings")
################ Plot Type 1 #####################
################ Plot Type 2 #####################
# The two titles in this section were swapped relative to the innings filter.
# The other finals in this script title the innings == 2 plot with the side
# batting second; in the 2019 final New Zealand batted first and England
# second, so innings 2 is England batting.
# NOTE(review): assumes the innings column is numbered in batting order, as in
# the other sections - confirm against the CSV.

# Plot 1
# Rows with innings == 2, summed to runs off the bat per over block, striker
# and bowler.
inning_2 <- icc_2019[icc_2019$innings == 2, ]
df_2019_p2a <- inning_2 %>%
  group_by(binned_overs, striker, bowler) %>%
  summarise(runs_scored = sum(runs_off_bat))

# One panel per striker; a horizontal stacked bar per bowler, coloured by
# over block.
ggplot(df_2019_p2a, aes(y = bowler, x = runs_scored, fill = binned_overs)) +
  geom_col() +
  facet_wrap(~striker) +
  xlab("Runs Scored") +
  ylab("Bowler") +
  ggtitle("Runs scored by England Batsmen in front of New Zealand Bowlers")

# Plot 2
# Same breakdown for rows with innings == 1.
inning_1 <- icc_2019[icc_2019$innings == 1, ]
df_2019_p2b <- inning_1 %>%
  group_by(binned_overs, striker, bowler) %>%
  summarise(runs_scored = sum(runs_off_bat))

ggplot(df_2019_p2b, aes(y = bowler, x = runs_scored, fill = binned_overs)) +
  geom_col() +
  facet_wrap(~striker) +
  xlab("Runs Scored") +
  ylab("Bowler") +
  ggtitle("Runs scored by New Zealand Batsmen in front of England Bowlers")
################ Plot Type 2 #####################
####################### ICC 2019 Final ######################
####################### ICC 2023 Final ######################
####### Preprocessing #######
icc_2023 <- read.csv("Data_ICC_2023_F.csv")

# match_id is not used by any of the plots.
icc_2023$match_id <- NULL

# Blank dismissal fields are recoded as NA so wickets can be counted with
# !is.na().
icc_2023$wicket_type[icc_2023$wicket_type == ""] <- NA
icc_2023$player_dismissed[icc_2023$player_dismissed == ""] <- NA

# Integer part of ball, then grouped into blocks of ten: [0,10), [10,20), ...,
# with the last block closed on both sides ([40,50]).
icc_2023$over <- floor(icc_2023$ball)
over_intervals <- cut(
  icc_2023$over,
  breaks = seq(0, 50, 10),
  right = FALSE,
  include.lowest = TRUE
)
icc_2023$binned_overs <- over_intervals
####### Preprocessing #######
################ Plot Type 1 #####################
# Runs (off the bat plus extras) and number of wickets per over block and
# innings.
df_2023_p1 <- summarize(
  group_by(icc_2023, binned_overs, innings),
  runs = sum(runs_off_bat) + sum(extras),
  n_wickets = sum(!is.na(wicket_type))
)
# Dodged bars of runs per over block, one bar per innings. On top of each bar
# sits a translucent point whose size is five times the wicket count, and a
# text label showing that wicket count. Size and label are taken directly
# from df_2023_p1, so they rely on the plotted rows keeping that row order.
df_2023_p1 %>%
  pivot_longer(innings) %>%
  ggplot(aes(x = binned_overs, y = runs, fill = factor(value))) +
  geom_col(position = "dodge") +
  geom_point(
    aes(y = runs),
    position = position_dodge(width = 0.85),
    size = df_2023_p1$n_wickets * 5,
    alpha = 0.2
  ) +
  geom_text(
    label = df_2023_p1$n_wickets,
    position = position_dodge(width = 0.80),
    vjust = 0,
    hjust = 0.64,
    size = 3
  ) +
  # Show the interval levels produced by cut() as 1-based over ranges.
  scale_x_discrete(
    breaks = c("[0,10)", "[10,20)", "[20,30)", "[30,40)", "[40,50]"),
    labels = c("1-10", "11-20", "21-30", "31-40", "41-50")
  ) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  labs(title = "", x = "Overs", y = "Runs Scores", fill = "Innings")
################ Plot Type 1 #####################
################ Plot Type 2 #####################
# Plot 1
# Rows with innings == 2, summed to runs off the bat per over block, striker
# and bowler.
inning_2 <- icc_2023[icc_2023$innings == 2, ]
df_2023_p2a <- summarise(
  group_by(inning_2, binned_overs, striker, bowler),
  runs_scored = sum(runs_off_bat)
)

# One panel per striker; a horizontal stacked bar per bowler, coloured by
# over block.
ggplot(df_2023_p2a, aes(x = runs_scored, y = bowler, fill = binned_overs)) +
  geom_col() +
  facet_wrap(~striker) +
  labs(
    title = "Runs scored by Australian Batsmen in front of Indian Bowlers",
    x = "Runs Scored",
    y = "Bowler"
  )

# Plot 2
# Same breakdown for rows with innings == 1.
inning_1 <- icc_2023[icc_2023$innings == 1, ]
df_2023_p2b <- summarise(
  group_by(inning_1, binned_overs, striker, bowler),
  runs_scored = sum(runs_off_bat)
)

ggplot(df_2023_p2b, aes(x = runs_scored, y = bowler, fill = binned_overs)) +
  geom_col() +
  facet_wrap(~striker) +
  labs(
    title = "Runs scored by Indian Batsmen in front of Australian Bowlers",
    x = "Runs Scored",
    y = "Bowler"
  )
################ Plot Type 2 #####################
####################### ICC 2023 Final ######################